#Data Preprocessing: ##Replace all the missing values with NA.
library(plotly)
package ‘plotly’ was built under R version 3.6.3
Loading required package: ggplot2
package ‘ggplot2’ was built under R version 3.6.3
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
##Let’s take a look at the data structure
str(censusData)
'data.frame': 30162 obs. of 15 variables:
$ age : int 39 50 38 53 28 37 49 52 31 42 ...
$ workclass : Factor w/ 7 levels "Federal-gov",..: 6 5 3 3 3 3 3 5 3 3 ...
$ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
$ education : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
$ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
$ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
$ occupation : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
$ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
$ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
$ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
$ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
$ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
$ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ...
$ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
$ X : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
- attr(*, "na.action")= 'omit' Named int 15 28 39 52 62 70 78 94 107 129 ...
..- attr(*, "names")= chr "15" "28" "39" "52" ...
# Turn the categorical columns into plain character vectors so that the
# string-based clean-up further down (which targets character columns)
# applies to them.
chr_cols <- c(
  "workclass", "occupation", "native.country", "education",
  "marital.status", "relationship", "race", "sex", "X"
)
censusData[chr_cols] <- lapply(censusData[chr_cols], as.character)
#Now, let’s look at the new structure
str(censusData)
'data.frame': 30162 obs. of 15 variables:
$ age : int 39 50 38 53 28 37 49 52 31 42 ...
$ workclass : chr "State-gov" "Self-emp-not-inc" "Private" "Private" ...
$ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
$ education : chr "Bachelors" "Bachelors" "HS-grad" "11th" ...
$ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
$ marital.status: chr "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
$ occupation : chr "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
$ relationship : chr "Not-in-family" "Husband" "Not-in-family" "Husband" ...
$ race : chr "White" "White" "White" "Black" ...
$ sex : chr "Male" "Male" "Male" "Male" ...
$ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
$ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
$ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ...
$ native.country: chr "United-States" "United-States" "United-States" "United-States" ...
$ X : chr "<=50K" "<=50K" "<=50K" "<=50K" ...
- attr(*, "na.action")= 'omit' Named int 15 28 39 52 62 70 78 94 107 129 ...
..- attr(*, "names")= chr "15" "28" "39" "52" ...
##Let's look at the missing values in the columns
table(is.na(censusData))
FALSE
452430
# Cells holding exactly " ?" (note the leading space) are treated as missing
# and overwritten with NA.
censusData[censusData==" ?"]<- NA
##Remove all the rows that contain NA values.
# na.omit() drops every row with at least one NA; the indices of the dropped
# rows are kept in the "na.action" attribute of the result.
censusData <-na.omit(censusData)
##Remove all whitespaces from the columns.
# Install stringr only when it is missing. An unconditional install.packages()
# re-downloads on every run and errors with "Updating loaded packages" when
# the package is already loaded in the session.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
Error in install.packages : Updating loaded packages
# Install dplyr only when it is missing. An unconditional install.packages()
# re-downloads on every run and errors with "Updating loaded packages" when
# the package is already loaded in the session.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
Error in install.packages : Updating loaded packages
library(stringr)
package 㤼㸱stringr㤼㸲 was built under R version 3.6.3
library(dplyr)
package ‘dplyr’ was built under R version 3.6.3
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
# Strip leading and trailing whitespace from every character column.
censusData <- censusData %>%
  mutate_if(is.character, str_trim)
##In order to model and visualize the data, we need to convert the character columns back into factors
# Re-encode the cleaned text columns as factors in one pass; each column is
# converted independently with as.factor().
factor_cols <- c(
  "workclass", "occupation", "native.country", "education",
  "marital.status", "relationship", "race", "sex", "X"
)
censusData[factor_cols] <- lapply(censusData[factor_cols], as.factor)
str(censusData)
'data.frame': 30162 obs. of 15 variables:
$ age : int 39 50 38 53 28 37 49 52 31 42 ...
$ workclass : Factor w/ 7 levels "Federal-gov",..: 6 5 3 3 3 3 3 5 3 3 ...
$ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
$ education : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
$ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
$ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
$ occupation : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
$ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
$ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
$ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
$ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
$ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
$ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ...
$ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
$ X : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
- attr(*, "na.action")= 'omit' Named int 15 28 39 52 62 70 78 94 107 129 ...
..- attr(*, "names")= chr "15" "28" "39" "52" ...
#let’s now plot
plot(censusData$X)
##Data Manipulation: In this phase I will perform data manipulation to analyze the data set using various functions from the dplyr package
summary(censusData)
age workclass fnlwgt education education.num marital.status
Min. :17.00 Federal-gov : 943 Min. : 13769 HS-grad :9840 Min. : 1.00 Divorced : 4214
1st Qu.:28.00 Local-gov : 2067 1st Qu.: 117627 Some-college:6678 1st Qu.: 9.00 Married-AF-spouse : 21
Median :37.00 Private :22286 Median : 178425 Bachelors :5044 Median :10.00 Married-civ-spouse :14065
Mean :38.44 Self-emp-inc : 1074 Mean : 189794 Masters :1627 Mean :10.12 Married-spouse-absent: 370
3rd Qu.:47.00 Self-emp-not-inc: 2499 3rd Qu.: 237629 Assoc-voc :1307 3rd Qu.:13.00 Never-married : 9726
Max. :90.00 State-gov : 1279 Max. :1484705 11th :1048 Max. :16.00 Separated : 939
Without-pay : 14 (Other) :4618 Widowed : 827
occupation relationship race sex capital.gain capital.loss
Prof-specialty :4038 Husband :12463 Amer-Indian-Eskimo: 286 Female: 9782 Min. : 0 Min. : 0.00
Craft-repair :4030 Not-in-family : 7726 Asian-Pac-Islander: 895 Male :20380 1st Qu.: 0 1st Qu.: 0.00
Exec-managerial:3992 Other-relative: 889 Black : 2817 Median : 0 Median : 0.00
Adm-clerical :3721 Own-child : 4466 Other : 231 Mean : 1092 Mean : 88.37
Sales :3584 Unmarried : 3212 White :25933 3rd Qu.: 0 3rd Qu.: 0.00
Other-service :3212 Wife : 1406 Max. :99999 Max. :4356.00
(Other) :7585
hours.per.week native.country X
Min. : 1.00 United-States:27504 <=50K:22654
1st Qu.:40.00 Mexico : 610 >50K : 7508
Median :40.00 Philippines : 188
Mean :40.93 Germany : 128
3rd Qu.:45.00 Puerto-Rico : 109
Max. :99.00 Canada : 107
(Other) : 1516
##Extract the “education” column and store it in “census_ed”
# Pull the education column out as a standalone vector and inspect it.
census_ed <- censusData[["education"]]
View(census_ed)
class(census_ed)
[1] "factor"
head(census_ed)
[1] Bachelors Bachelors HS-grad 11th Bachelors Masters
16 Levels: 10th 11th 12th 1st-4th 5th-6th 7th-8th 9th Assoc-acdm Assoc-voc Bachelors Doctorate HS-grad Masters Preschool ... Some-college
##Extract all the columns from “age” to “relationship” and store it in “census_seq”.
# dplyr is only installed when it is not available yet. Reinstalling a package
# that is already loaded errors with "Updating loaded packages".
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
Error in install.packages : Updating loaded packages
library(dplyr)
# Keep the contiguous run of columns that starts at age and ends at
# relationship, then print the result.
census_seq <- censusData %>%
  select(age:relationship)
census_seq
##Extract the column number “5”, “8”, “11” and store it in “census_col”
# Pick columns by position: the 5th, 8th and 11th.
census_col <- censusData[, c(5, 8, 11), drop = FALSE]
View(census_col)
head(census_col)
##Extract all the male employees who work in state-gov and store it in “male_gov”.
# Install dplyr only if it is missing; an unconditional call downloads and
# reinstalls the package on every run of the script.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into 㤼㸱C:/Users/ADMIN/Documents/R/win-library/3.6㤼㸲
(as 㤼㸱lib㤼㸲 is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.6/dplyr_1.0.2.zip'
Content type 'application/zip' length 1527701 bytes (1.5 MB)
downloaded 1.5 MB
package ‘dplyr’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\ADMIN\AppData\Local\Temp\RtmpeEiq4m\downloaded_packages
library(dplyr)
package ‘dplyr’ was built under R version 3.6.3
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
# Rows where sex is "Male" and workclass is "State-gov"; filter() combines
# comma-separated conditions with a logical AND.
male_gov <- filter(censusData, sex == "Male", workclass == "State-gov")
View(male_gov)
##Extract all the 39 year olds who either have a bachelor’s degree # or who are natives of the United States and store the result in “census_us”
table(censusData$native.country)
Cambodia Canada China Columbia Cuba
18 107 68 56 92
Dominican-Republic Ecuador El-Salvador England France
67 27 100 86 27
Germany Greece Guatemala Haiti Holand-Netherlands
128 29 63 42 1
Honduras Hong Hungary India Iran
12 19 13 100 42
Ireland Italy Jamaica Japan Laos
24 68 80 59 17
Mexico Nicaragua Outlying-US(Guam-USVI-etc) Peru Philippines
610 33 14 30 188
Poland Portugal Puerto-Rico Scotland South
56 34 109 11 71
Taiwan Thailand Trinadad&Tobago United-States Vietnam
42 17 18 27504 64
Yugoslavia
16
table(censusData$education)
10th 11th 12th 1st-4th 5th-6th 7th-8th 9th Assoc-acdm Assoc-voc Bachelors
820 1048 377 151 288 557 455 1008 1307 5044
Doctorate HS-grad Masters Preschool Prof-school Some-college
375 9840 1627 45 542 6678
# 39-year-olds that hold a Bachelors degree or list United-States as their
# native country (either condition is enough).
census_us <- filter(
  censusData,
  age == 39,
  education == "Bachelors" | native.country == "United-States"
)
View(census_us)
##Extract 200 random rows from the “census” data frame and store it in “census_200”.
# Draw 200 rows at random from the cleaned data.
census_200 <- censusData %>%
  sample_n(size = 200)
View(census_200)
##Get the count of different levels of the “workclass” column.
# Install plyr only if it is missing; an unconditional call downloads and
# reinstalls the package on every run of the script.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into 㤼㸱C:/Users/ADMIN/Documents/R/win-library/3.6㤼㸲
(as 㤼㸱lib㤼㸲 is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.6/plyr_1.8.6.zip'
Content type 'application/zip' length 1314846 bytes (1.3 MB)
downloaded 1.3 MB
package ‘plyr’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\ADMIN\AppData\Local\Temp\RtmpeEiq4m\downloaded_packages
library(plyr)
package ‘plyr’ was built under R version 3.6.3
---------------------------------------------------------------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
---------------------------------------------------------------------------------------------------------------------------------------
Attaching package: ‘plyr’
The following objects are masked from ‘package:dplyr’:
arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
# Frequency of each workclass level. count() exists in both plyr and dplyr
# with different interfaces, so the plyr version is called explicitly instead
# of relying on whichever package happened to be attached last.
countWcls <- plyr::count(censusData$workclass)
countWcls
# Cross-check the same counts with base R.
table(censusData$workclass)
Federal-gov Local-gov Private Self-emp-inc Self-emp-not-inc State-gov Without-pay
943 2067 22286 1074 2499 1279 14
##Calculate the mean of “capital.gain” column grouped according to “workclass”.
# Average capital.gain within each workclass level.
with(censusData, tapply(capital.gain, workclass, mean))
Federal-gov Local-gov Private Self-emp-inc Self-emp-not-inc State-gov Without-pay
832.3213 829.2303 879.8582 4810.7467 1913.1345 684.3065 487.8571
#Data Visualization:
# Install ggplot2 only if it is missing; an unconditional call downloads and
# reinstalls the package on every run of the script.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into 㤼㸱C:/Users/ADMIN/Documents/R/win-library/3.6㤼㸲
(as 㤼㸱lib㤼㸲 is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.6/ggplot2_3.3.2.zip'
Content type 'application/zip' length 4068917 bytes (3.9 MB)
downloaded 3.9 MB
package ‘ggplot2’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\ADMIN\AppData\Local\Temp\RtmpeEiq4m\downloaded_packages
library(ggplot2)
package 㤼㸱ggplot2㤼㸲 was built under R version 3.6.3
##Build a bar-plot for the “relationship” column and fill the bars according to the “race” # column.
# One bar per relationship category, filled by race.
ggplot(data = censusData, mapping = aes(x = relationship, fill = race)) +
  geom_bar()
##Set x-axis label to ‘Categories of Relationships’ ##Set y-axis label to ‘Count of Categories’
ggplot(data = censusData, mapping = aes(x = relationship, fill = race)) +
  geom_bar() +
  xlab("Categories of Relationships") +
  ylab("Count of Categories")
##Fill the bars according to “sex”
ggplot(data = censusData, mapping = aes(x = relationship, fill = sex)) +
  geom_bar() +
  xlab("Categories of Relationships") +
  ylab("Count of Categories")
##Set the position of the bars to “dodge”
ggplot(data = censusData, mapping = aes(x = relationship, fill = sex)) +
  geom_bar(position = "dodge") +
  xlab("Categories of Relationships") +
  ylab("Count of Categories")
##Set the title of the plot to “Distribution of Relationships by Sex”
ggplot(data = censusData, mapping = aes(x = relationship, fill = sex)) +
  geom_bar(position = "dodge") +
  xlab("Categories of Relationships") +
  ylab("Count of Categories") +
  ggtitle("Distribution of Relationships by Sex")
##Build a Histogram for the “age” column with number of bins equal to 50.
ggplot(data = censusData, mapping = aes(x = age)) +
  geom_histogram(bins = 50)
table(censusData$age)
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
328 447 594 629 621 674 824 752 799 745 789 808 774 813 851 789 837 836 828 852 828 791 786 765 769 741 743 704 706 711 683 523 555 575
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
571 455 448 394 386 343 337 344 332 276 259 213 186 173 136 110 111 90 80 64 54 40 49 38 34 29 20 14 15 16 13 7 5 8
85 86 88 90
3 1 3 35
##Fill the bars of the histogram according to yearly income column i.e., “X”
# Age histogram with 90 bins, bars filled by the income class X.
ggplot(data = censusData, mapping = aes(x = age, fill = X)) +
  geom_histogram(bins = 90)
##Set the title of the plot to “Distribution of Age”.
ggplot(data = censusData, mapping = aes(x = age, fill = X)) +
  geom_histogram(bins = 90) +
  ggtitle("Distribution of Age")
##Set the legend title to “Yearly income”.
ggplot(data = censusData, mapping = aes(x = age, fill = X)) +
  geom_histogram(bins = 90) +
  ggtitle("Distribution of Age") +
  labs(fill = "Yearly income")
##Set the theme of the plot to black and white.
# Age histogram drawn with the black-and-white theme. theme() with no
# arguments leaves the default theme untouched; theme_bw() is the complete
# theme that actually switches the plot to black and white.
ggplot(censusData, aes(x = age)) +
  geom_histogram(bins = 90) +
  labs(title = "Distribution of Age") +
  theme_bw()
##Build a scatter-plot between “capital.gain” and “hours.per.week”. ## Map “capital.gain” on the x- axis and “hours.per.week” on the y-axis.
# Scatter plot with capital.gain on the x-axis and hours.per.week on the y-axis.
ggplot(data = censusData, mapping = aes(x = capital.gain, y = hours.per.week)) +
  geom_point()
##Set the transparency of the points to 40% and size as 2.
# alpha is opacity, so 40% transparency corresponds to alpha = 0.6.
ggplot(data = censusData, mapping = aes(x = capital.gain, y = hours.per.week)) +
  geom_point(size = 2, alpha = 0.6)
##Set the color of the points according to the “X” (yearly income) column.
# Colour the points by income class. The default point shape ignores the
# `fill` aesthetic, so the grouping must be mapped to `colour` to be visible.
ggplot(censusData, aes(x = capital.gain, y = hours.per.week, colour = X)) +
  geom_point()
##Set the x-axis label to “Capital Gain”, y-axis label to “Hours per Week”, title # to “Capital Gain vs Hours per Week by Income”, and legend label to “Yearly Income”.
# Labelled scatter plot coloured by income class. The default point shape
# ignores `fill`, so X is mapped to `colour`; the legend title is set on the
# same aesthetic so that it actually appears.
ggplot(censusData, aes(x = capital.gain, y = hours.per.week, colour = X)) +
  geom_point(alpha = 0.6, size = 2) +
  labs(
    x = "Capital Gain",
    y = "Hours per Week",
    title = "Capital Gain vs Hours per Week by Income",
    colour = "Yearly Income"
  )
# Install plotly only if it is missing; an unconditional call downloads and
# reinstalls the package on every run of the script.
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into 㤼㸱C:/Users/ADMIN/Documents/R/win-library/3.6㤼㸲
(as 㤼㸱lib㤼㸲 is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.6/plotly_4.9.2.1.zip'
Content type 'application/zip' length 3044949 bytes (2.9 MB)
downloaded 2.9 MB
package ‘plotly’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\ADMIN\AppData\Local\Temp\RtmpeEiq4m\downloaded_packages
library(plotly)
package ‘plotly’ was built under R version 3.6.3
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following objects are masked from ‘package:plyr’:
arrange, mutate, rename, summarise
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
# Interactive scatter plot coloured by income class. The marker mode is stated
# explicitly; without it plotly emits "No scatter mode specifed" and falls
# back to markers anyway.
plot_ly(
  data = censusData,
  x = ~capital.gain,
  y = ~hours.per.week,
  color = ~X,
  type = "scatter",
  mode = "markers"
)
No scatter mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
`arrange_()` is deprecated as of dplyr 0.7.0.
Please use `arrange()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
minimal value for n is 3, returning requested palette with 3 different levels
minimal value for n is 3, returning requested palette with 3 different levels
No scatter mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
minimal value for n is 3, returning requested palette with 3 different levels
minimal value for n is 3, returning requested palette with 3 different levels
##Build a box-plot between “education” and “age” column.Map “education” on the x-axis and ## “age” on the y-axis.
# Age distribution per education level.
ggplot(data = censusData, mapping = aes(x = education, y = age)) +
  geom_boxplot()
# Same plot with the boxes filled by sex.
ggplot(data = censusData, mapping = aes(x = education, y = age, fill = sex)) +
  geom_boxplot()
# Filled version with a title.
ggplot(data = censusData, mapping = aes(x = education, y = age, fill = sex)) +
  geom_boxplot() +
  ggtitle("Box-Plot of age by Education and Sex")
#Prediction: building a Linear Regression Model: ## Build a simple linear regression model ##Divide the dataset into training and test sets in 70:30 ratio.
# Fix the random seed so the random split below is repeatable.
set.seed(98)
# Install caTools only if it is missing; an unconditional call downloads and
# reinstalls the package on every run of the script.
if (!requireNamespace("caTools", quietly = TRUE)) {
  install.packages("caTools")
}
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into 㤼㸱C:/Users/ADMIN/Documents/R/win-library/3.6㤼㸲
(as 㤼㸱lib㤼㸲 is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.6/caTools_1.18.0.zip'
Content type 'application/zip' length 330351 bytes (322 KB)
downloaded 322 KB
package ‘caTools’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\ADMIN\AppData\Local\Temp\RtmpY9qMhV\downloaded_packages
library("caTools")
package 㤼㸱caTools㤼㸲 was built under R version 3.6.3
# Logical mask produced by sample.split() with SplitRatio = 0.70: TRUE marks
# rows assigned to the training set, FALSE marks rows held out for testing.
split_data <- sample.split(censusData$hours.per.week, SplitRatio = 0.70)
View(split_data)
# Use the mask directly rather than comparing against the reassignable
# shorthands T and F.
censusTrain <- subset(censusData, split_data)
censusTest <- subset(censusData, !split_data)
View(censusTrain)
View(censusTest)
nrow(censusTrain)
[1] 21113
nrow(censusTest)
[1] 9049
View(split_data)
##The dependent variable is “hours.per.week” and the independent variable is “education.num”. ## dependent ~ independent
# Look at the two columns involved in the regression.
View(censusData[c("hours.per.week", "education.num")])
# Simple linear regression of hours.per.week on education.num, fitted on the
# training rows only.
LR_model <- lm(formula = hours.per.week ~ education.num, data = censusTrain)
summary(LR_model)
Call:
lm(formula = hours.per.week ~ education.num, data = censusTrain)
Residuals:
Min 1Q Median 3Q Max
-44.064 -2.954 -0.140 4.157 62.378
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 33.80781 0.33445 101.08 <2e-16 ***
education.num 0.70353 0.03204 21.96 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 11.85 on 21111 degrees of freedom
Multiple R-squared: 0.02233, Adjusted R-squared: 0.02229
F-statistic: 482.3 on 1 and 21111 DF, p-value: < 2.2e-16
##Find the root-mean-square error (RMSE).
censusP<-predict(LR_model,newdata=censusTest)
head(censusP)
9 15 16 20 22 24
43.65722 36.62193 40.13957 45.06428 37.32546 40.13957
View(censusP)
# Put the observed test-set hours next to the model's predictions.
censusD <- cbind(
  Actual = censusTest[["hours.per.week"]],
  Predicted = censusP
)
View(censusD)
class(censusD)
[1] "matrix"
# Convert the matrix to a data frame so its columns can be accessed by name.
censusD <- as.data.frame(censusD)
# Residual of each test row: observed minus predicted.
Error <- with(censusD, Actual - Predicted)
View(Error)
Data <- cbind(censusD, Error)
View(Data)
# Root-mean-square error: square the residuals, average them, take the root.
sqrt(mean(Data$Error^2))
[1] 11.82446
library(caret)
package ‘caret’ was built under R version 3.6.3
Loading required package: lattice
Loading required package: ggplot2
package ‘ggplot2’ was built under R version 3.6.3
RMSE(censusP, censusTest$hours.per.week)
[1] 11.82446
#Prediction building a Logistic Regression
# caTools is only installed when it is not available yet. Reinstalling a
# package that is already loaded errors with "Updating loaded packages".
if (!requireNamespace("caTools", quietly = TRUE)) {
  install.packages("caTools")
}
Error in install.packages : Updating loaded packages
library("caTools")
# Logical mask produced by sample.split() on the income class X with
# SplitRatio = 0.65: TRUE rows go to training, FALSE rows to testing.
split_data1 <- sample.split(censusData$X, SplitRatio = 0.65)
# Use the mask directly rather than comparing against the reassignable
# shorthands T and F.
censusTrain1 <- subset(censusData, split_data1)
censusTest1 <- subset(censusData, !split_data1)
nrow(censusTrain1)
[1] 19605
nrow(censusTest1)
[1] 10557
# Logistic regression of the income class X on occupation, fitted on the
# training rows only.
log_mod <- glm(
  formula = X ~ occupation,
  family = "binomial",
  data = censusTrain1
)
summary(log_mod)
Call:
glm(formula = X ~ occupation, family = "binomial", data = censusTrain1)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.1653 -0.7900 -0.5181 -0.1358 3.0631
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.94027 0.06123 -31.686 < 2e-16 ***
occupationArmed-Forces -10.62579 132.57607 -0.080 0.93612
occupationCraft-repair 0.70014 0.07699 9.094 < 2e-16 ***
occupationExec-managerial 1.91171 0.07276 26.275 < 2e-16 ***
occupationFarming-fishing -0.03095 0.13548 -0.228 0.81928
occupationHandlers-cleaners -0.60676 0.14262 -4.255 2.09e-05 ***
occupationMachine-op-inspct -0.07784 0.10753 -0.724 0.46912
occupationOther-service -1.18778 0.12494 -9.506 < 2e-16 ***
occupationPriv-house-serv -2.74186 1.00648 -2.724 0.00645 **
occupationProf-specialty 1.72773 0.07289 23.704 < 2e-16 ***
occupationProtective-serv 1.21252 0.11914 10.178 < 2e-16 ***
occupationSales 0.93585 0.07709 12.140 < 2e-16 ***
occupationTech-support 1.21450 0.10640 11.415 < 2e-16 ***
occupationTransport-moving 0.59881 0.09863 6.071 1.27e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 22002 on 19604 degrees of freedom
Residual deviance: 19420 on 19591 degrees of freedom
AIC: 19448
Number of Fisher Scoring iterations: 11
# type = "response" returns predicted probabilities rather than log-odds.
pred_val <- predict(
  log_mod,
  newdata = censusTest1,
  type = "response"
)
head(pred_val)
2 5 8 9 12 15
0.4928599 0.4470634 0.4928599 0.4470634 0.4470634 0.2072692
range(pred_val)
[1] 3.488403e-06 4.928599e-01
# ROCR is only installed when it is not available yet. Reinstalling a package
# that is already loaded errors with "Updating loaded packages".
if (!requireNamespace("ROCR", quietly = TRUE)) {
  install.packages("ROCR")
}
Error in install.packages : Updating loaded packages
library(ROCR)
# Pair the predicted probabilities with the true test labels in a ROCR
# prediction object, then print it.
predict_log_roc <- prediction(predictions = pred_val, labels = censusTest1$X)
predict_log_roc
A prediction instance
with 10557 data points
# Accuracy measure ("acc") computed from the prediction object, then plotted.
acc <- performance(predict_log_roc, measure = "acc")
plot(acc)
# Class balance of the full data set, for reference.
table(censusData$X)
<=50K >50K
22654 7508
# Turn probabilities into class labels using a 0.47 cut-off: anything above it
# is labelled ">50K", everything else "<=50K".
# NOTE(review): range(pred_val) above tops out at about 0.493, so only the
# highest-probability group clears this threshold - confirm 0.47 is intended.
lm.pred<-ifelse(pred_val>0.47,">50K","<=50K")
Warning message:
package ‘ROCR’ was built under R version 3.6.3
lm.pred
2 5 8 9 12 15 18 20 21 22 23 24 25 26 27 29 30
">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
32 35 37 46 47 51 52 54 55 56 61 62 63 64 67 69 71
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K"
76 77 79 83 88 89 90 92 93 95 99 101 103 106 108 109 111
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
119 121 122 124 125 126 127 131 133 141 145 146 147 152 153 154 157
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K"
159 160 168 169 170 174 178 184 185 186 188 193 194 200 202 205 207
"<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
211 214 216 219 220 224 229 233 237 242 243 251 252 253 254 262 270
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
273 274 275 276 282 283 288 289 293 300 301 303 305 306 315 320 322
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
323 324 327 329 331 340 341 344 345 346 348 349 353 354 355 360 362
"<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
366 376 378 380 381 387 390 391 395 400 404 405 406 407 416 417 418
"<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
423 424 425 427 431 436 437 438 440 461 462 464 470 474 478 479 482
">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K"
485 492 494 495 500 504 507 512 522 523 527 530 534 536 539 541 542
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
544 545 549 552 553 556 559 560 561 562 570 577 578 579 584 586 593
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K"
595 596 597 598 599 600 605 606 608 611 615 616 617 621 622 623 627
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
631 637 639 641 642 646 648 649 650 652 654 662 663 664 667 673 674
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K"
678 680 683 684 685 689 690 693 694 698 701 704 708 709 710 714 715
"<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
721 724 731 735 737 741 742 743 745 747 749 754 755 757 759 760 762
">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
764 769 772 774 784 786 787 791 795 799 808 809 812 814 821 822 824
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
827 828 829 830 836 844 850 852 854 855 857 865 866 867 868 869 870
">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
874 876 880 881 882 883 885 887 888 893 895 898 906 908 909 913 914
"<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
915 916 919 921 922 924 926 929 943 944 945 948 951 953 954 959 960
"<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
962 963 967 968 970 976 977 979 980 981 983 985 986 987 990 992 994
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
995 1002 1003 1004 1011 1012 1019 1021 1024 1026 1027 1030 1034 1035 1036 1039 1041
"<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K"
1045 1048 1049 1050 1051 1053 1055 1060 1061 1066 1070 1071 1072 1073 1076 1078 1079
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1084 1085 1093 1096 1103 1116 1119 1123 1125 1126 1128 1131 1133 1134 1137 1139 1140
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1142 1143 1145 1148 1150 1154 1155 1165 1167 1171 1173 1177 1180 1181 1183 1184 1190
">50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1193 1198 1199 1200 1201 1203 1208 1212 1216 1221 1225 1231 1232 1234 1243 1248 1249
"<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1250 1252 1255 1258 1260 1262 1264 1267 1271 1277 1279 1281 1284 1285 1288 1290 1294
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1295 1303 1305 1317 1318 1323 1324 1325 1326 1329 1330 1331 1332 1333 1335 1337 1340
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
1341 1342 1343 1354 1357 1360 1364 1365 1366 1367 1373 1376 1377 1379 1382 1388 1392
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1393 1394 1396 1400 1402 1406 1412 1415 1416 1419 1421 1424 1425 1427 1434 1435 1439
">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1440 1442 1443 1448 1449 1452 1453 1455 1458 1468 1471 1473 1474 1481 1483 1484 1485
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K"
1492 1494 1495 1501 1505 1506 1507 1511 1517 1519 1520 1524 1525 1526 1528 1536 1538
"<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K"
1540 1541 1542 1546 1547 1548 1549 1551 1563 1566 1567 1571 1572 1573 1574 1576 1578
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1579 1581 1582 1584 1585 1586 1587 1591 1595 1597 1598 1599 1603 1604 1609 1612 1614
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K"
1615 1623 1624 1625 1627 1631 1632 1639 1640 1641 1642 1646 1649 1650 1651 1657 1660
">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1662 1667 1668 1669 1671 1673 1677 1678 1679 1681 1685 1688 1689 1692 1693 1697 1702
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1705 1708 1709 1714 1717 1718 1719 1720 1721 1723 1733 1734 1735 1739 1741 1742 1744
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
1745 1748 1749 1751 1753 1755 1757 1758 1761 1764 1765 1766 1781 1785 1786 1792 1793
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
1797 1799 1802 1803 1806 1814 1817 1818 1820 1822 1823 1825 1832 1835 1838 1839 1840
">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1845 1849 1853 1854 1857 1859 1861 1862 1864 1869 1874 1877 1883 1890 1891 1892 1898
"<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
1899 1905 1907 1908 1912 1915 1916 1917 1921 1924 1928 1932 1933 1934 1941 1946 1948
"<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
1949 1952 1954 1955 1958 1960 1963 1970 1971 1976 1977 1980 1982 1983 1987 1989 1992
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K"
2002 2003 2006 2008 2013 2017 2018 2021 2022 2034 2035 2036 2039 2046 2048 2050 2055
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
2061 2065 2069 2073 2078 2080 2081 2083 2084 2090 2091 2094 2100 2102 2103 2104 2105
"<=50K" ">50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
2107 2114 2116 2120 2121 2131 2132 2135 2138 2140 2141 2144 2145 2148 2149 2150 2152
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
2154 2155 2156 2157 2160 2161 2163 2165 2166 2170 2171 2176 2177 2182 2189 2191 2192
"<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K"
2197 2199 2200 2201 2203 2205 2207 2209 2212 2213 2215 2217 2218 2219 2224 2226 2229
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K"
2230 2232 2234 2235 2238 2240 2243 2255 2257 2258 2263 2264 2268 2269 2272 2274 2275
"<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
2283 2286 2291 2293 2295 2296 2298 2301 2302 2304 2308 2311 2312 2319 2320 2324 2327
"<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K"
2329 2331 2333 2335 2343 2349 2351 2353 2354 2358 2359 2360 2362 2368 2373 2374 2375
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K"
2381 2387 2388 2392 2393 2398 2399 2401 2406 2407 2415 2423 2424 2426 2427 2430 2433
"<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K"
2435 2436 2437 2438 2441 2444 2445 2452 2453 2455 2456 2460 2461 2462 2463 2464 2468
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
2469 2472 2482 2483 2484 2485 2486 2488 2491 2492 2498 2499 2500 2502 2506 2508 2509
">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K"
2515 2517 2519 2520 2521 2525 2527 2529 2530 2533 2536 2544 2548 2554 2559 2560 2563
"<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
2565 2571 2577 2583 2586 2591 2593 2594 2601 2610 2615 2618 2621 2622 2624 2626 2628
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
2629 2630 2641 2644 2646 2650 2651 2652 2656 2658 2661 2665 2666 2667 2668 2669 2671
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
2672 2678 2681 2682 2688 2689 2691 2692 2697 2700 2704 2705 2711 2714 2717 2719 2721
"<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K"
2723 2724 2726 2728 2730 2732 2733 2734 2735 2737 2738 2739 2740 2744 2746 2747 2752
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
2754 2759 2762 2763 2764 2767 2768 2769 2773 2775 2782 2787 2794 2795
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
[ reached getOption("max.print") -- omitted 9557 entries ]
# Cross-tabulate the predicted labels (rows) against the actual labels
# (columns) of the first test set.
tab <- table(lm.pred, censusTest1[["X"]])
print(tab)
lm.pred <=50K >50K
<=50K 7188 1968
>50K 741 660
# Layout of the table above (rows = predicted, columns = actual):
#   TP FP
#   FN TN
# This layout implies "<=50K" is being treated as the positive class.
# TP and TN are correctly predicted; FP and FN are wrongly predicted.
# Accuracy worked out by hand from the printed counts.
(7188 + 660) / (7188 + 660 + 1968 + 741)
[1] 0.743393
# Accuracy = correctly classified cases (table diagonal) / all cases.
accuracy <- sum(diag(tab)) / sum(tab)
print(accuracy)
[1] 0.743393
# Install caTools only when it is missing, so re-running the analysis does
# not reinstall the package, and does not fail with "Updating loaded
# packages" once caTools is already attached.
if (!requireNamespace("caTools", quietly = TRUE)) {
  install.packages("caTools")
}
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/ADMIN/Documents/R/win-library/3.6’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.6/caTools_1.18.0.zip'
Content type 'application/zip' length 330351 bytes (322 KB)
downloaded 322 KB
package ‘caTools’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\ADMIN\AppData\Local\Temp\Rtmpyaqneh\downloaded_packages
library("caTools")
package ‘caTools’ was built under R version 3.6.3
# ROC curve: true-positive rate plotted against false-positive rate.
roc <- performance(predict_log_roc, measure = "tpr", x.measure = "fpr")
plot(roc)
# Area under the ROC curve, held in a ROCR performance object.
auc <- performance(predict_log_roc, measure = "auc")
auc
A performance instance
'Area under the ROC curve'
# Replace the performance object with the numeric AUC stored as the first
# element of its y.values slot.
auc <- slot(auc, "y.values")[[1]]
print(auc)
[1] 0.7224386
# Split on the outcome column X with SplitRatio = 0.80; rows flagged TRUE
# form the training set and the rest form the test set.
split_data1 <- sample.split(censusData$X, SplitRatio = 0.80)
censusTrain2 <- censusData[split_data1, ]
censusTest2 <- censusData[!split_data1, ]
# Logistic regression of X on age, workclass and education.
log_mod2 <- glm(
  X ~ age + workclass + education,
  data = censusTrain2,
  family = "binomial"
)
summary(log_mod2)
Call:
glm(formula = X ~ age + workclass + education, family = "binomial",
data = censusTrain2)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.45996 -0.71258 -0.48153 -0.00092 2.89674
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.045540 0.185147 -21.850 < 2e-16 ***
age 0.043331 0.001357 31.937 < 2e-16 ***
workclassLocal-gov -0.535418 0.102298 -5.234 1.66e-07 ***
workclassPrivate -0.350398 0.084826 -4.131 3.62e-05 ***
workclassSelf-emp-inc 0.691465 0.112371 6.153 7.58e-10 ***
workclassSelf-emp-not-inc -0.433828 0.099532 -4.359 1.31e-05 ***
workclassState-gov -0.645090 0.114088 -5.654 1.56e-08 ***
workclassWithout-pay -13.855967 244.920373 -0.057 0.9549
education11th -0.020626 0.211948 -0.097 0.9225
education12th 0.293045 0.269787 1.086 0.2774
education1st-4th -1.140963 0.534504 -2.135 0.0328 *
education5th-6th -0.533254 0.347338 -1.535 0.1247
education7th-8th -0.607732 0.252210 -2.410 0.0160 *
education9th -0.550901 0.288260 -1.911 0.0560 .
educationAssoc-acdm 1.563876 0.173313 9.023 < 2e-16 ***
educationAssoc-voc 1.631505 0.167947 9.714 < 2e-16 ***
educationBachelors 2.344929 0.154836 15.145 < 2e-16 ***
educationDoctorate 3.579137 0.204748 17.481 < 2e-16 ***
educationHS-grad 0.971806 0.153985 6.311 2.77e-10 ***
educationMasters 2.804354 0.161939 17.317 < 2e-16 ***
educationPreschool -12.098579 136.567513 -0.089 0.9294
educationProf-school 3.642187 0.191060 19.063 < 2e-16 ***
educationSome-college 1.303126 0.155197 8.397 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 27079 on 24128 degrees of freedom
Residual deviance: 22478 on 24106 degrees of freedom
AIC: 22524
Number of Fisher Scoring iterations: 13
# Predicted probabilities (type = "response") for the test rows.
pred_val <- predict(log_mod2, newdata = censusTest2, type = "response")
print(head(pred_val))
3 5 9 10 12 19
0.1446068 0.3020126 0.4382387 0.4424821 0.2600398 0.5469194
# ROCR is used below to assess accuracy.
library(ROCR)
# Pair the predicted probabilities with the true labels of the test set.
predict_log_roc <- prediction(predictions = pred_val, labels = censusTest2$X)
predict_log_roc
A prediction instance
with 6033 data points
# Accuracy measure computed by ROCR from the prediction object, then plotted.
acc <- performance(predict_log_roc, measure = "acc")
plot(acc)
# Label a row ">50K" when its predicted probability exceeds 0.45,
# otherwise "<=50K".
lm.pred <- ifelse(pred_val > 0.45, ">50K", "<=50K")
print(lm.pred)
3 5 9 10 12 19 27 31 33 34 38 45 49 57 66 72 92
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
106 121 122 127 130 131 135 139 142 155 158 162 169 170 171 173 175
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
181 185 186 188 208 216 217 228 229 230 231 237 240 242 253 257 259
">50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
266 268 273 281 291 294 298 300 301 320 326 330 331 343 346 349 351
">50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
359 361 366 367 370 379 384 395 397 400 403 417 425 432 447 454 455
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
456 466 467 469 482 488 493 496 499 503 508 509 513 517 520 524 527
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K"
528 529 530 534 535 542 554 556 557 562 564 568 573 580 582 595 610
"<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K"
614 623 631 633 634 636 637 640 647 649 655 668 672 676 685 691 694
"<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
698 699 709 712 715 718 722 739 742 747 751 753 756 765 770 777 779
"<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
783 785 788 792 793 796 797 798 807 808 810 811 813 814 818 821 848
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
851 852 863 864 867 868 874 876 886 889 895 897 900 906 907 919 922
"<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K"
927 932 934 938 940 942 943 944 952 953 955 957 964 969 984 989 990
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K"
998 1001 1004 1009 1012 1016 1022 1029 1030 1032 1037 1043 1046 1062 1070 1074 1078
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1083 1088 1092 1093 1101 1107 1111 1113 1117 1123 1126 1139 1142 1143 1146 1153 1167
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
1191 1197 1200 1208 1218 1222 1226 1227 1228 1247 1250 1251 1260 1261 1270 1274 1275
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K"
1280 1283 1285 1288 1290 1299 1303 1304 1318 1319 1320 1321 1330 1336 1339 1340 1341
"<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
1346 1353 1362 1372 1381 1387 1394 1398 1400 1402 1405 1409 1419 1426 1429 1430 1439
"<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1443 1445 1456 1457 1467 1472 1475 1485 1493 1494 1496 1497 1504 1507 1515 1516 1520
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
1521 1533 1534 1540 1548 1556 1557 1561 1577 1582 1584 1585 1587 1604 1609 1620 1621
">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
1623 1625 1634 1636 1637 1644 1645 1646 1648 1649 1656 1681 1682 1683 1685 1693 1696
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1706 1713 1717 1720 1722 1723 1725 1733 1734 1740 1741 1744 1746 1752 1760 1770 1775
"<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
1778 1789 1791 1792 1795 1798 1801 1816 1824 1825 1830 1832 1838 1842 1844 1846 1848
"<=50K" "<=50K" ">50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1854 1869 1873 1876 1883 1889 1892 1896 1899 1904 1905 1909 1912 1913 1914 1923 1924
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
1929 1930 1939 1947 1951 1952 1954 1956 1957 1960 1968 1970 1971 1979 1986 1988 1991
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
2000 2019 2039 2043 2070 2073 2076 2078 2085 2087 2092 2093 2098 2104 2115 2123 2127
"<=50K" "<=50K" ">50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" ">50K"
2130 2139 2143 2156 2157 2160 2170 2173 2175 2176 2179 2182 2185 2193 2194 2199 2219
"<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K"
2227 2229 2240 2251 2256 2259 2264 2268 2275 2277 2286 2290 2293 2309 2319 2320 2325
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
2329 2342 2355 2363 2364 2370 2374 2379 2380 2388 2399 2409 2416 2426 2429 2430 2434
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K"
2439 2442 2455 2456 2458 2463 2467 2476 2482 2487 2488 2491 2492 2495 2497 2499 2501
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
2513 2522 2524 2525 2541 2545 2554 2563 2566 2573 2577 2581 2584 2585 2587 2590 2592
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
2604 2606 2608 2614 2618 2621 2634 2641 2642 2657 2661 2665 2666 2669 2672 2674 2675
"<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
2682 2683 2685 2692 2696 2699 2700 2703 2711 2719 2729 2732 2734 2741 2749 2758 2763
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
2777 2783 2787 2791 2792 2797 2798 2801 2810 2815 2831 2834 2835 2841 2844 2846 2848
">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
2876 2882 2884 2885 2890 2894 2898 2903 2904 2908 2917 2922 2924 2928 2929 2932 2934
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
2937 2960 2962 2965 2967 2970 2972 2976 2993 3006 3012 3017 3034 3036 3037 3041 3043
"<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
3045 3052 3056 3059 3069 3073 3075 3083 3086 3087 3091 3098 3115 3120 3123 3126 3129
"<=50K" ">50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
3133 3140 3156 3158 3161 3174 3176 3179 3184 3191 3192 3199 3202 3210 3214 3217 3230
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
3231 3236 3238 3245 3251 3254 3255 3256 3265 3275 3278 3280 3282 3300 3323 3333 3341
">50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" ">50K"
3347 3353 3367 3369 3380 3384 3385 3393 3396 3397 3398 3399 3416 3420 3422 3424 3425
"<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
3429 3430 3434 3436 3439 3440 3443 3450 3455 3460 3468 3469 3470 3471 3477 3481 3482
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
3483 3485 3487 3496 3499 3500 3501 3511 3516 3517 3521 3524 3528 3529 3534 3535 3546
"<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
3561 3564 3572 3574 3577 3582 3588 3596 3612 3617 3623 3624 3634 3642 3643 3648 3651
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
3654 3656 3660 3661 3667 3669 3684 3692 3695 3703 3710 3712 3713 3721 3726 3727 3733
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
3740 3741 3749 3756 3760 3761 3762 3765 3769 3773 3783 3785 3787 3795 3801 3802 3805
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K"
3807 3811 3815 3822 3827 3833 3836 3846 3867 3869 3872 3885 3895 3915 3927 3938 3944
">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K" "<=50K"
3945 3946 3950 3952 3954 3962 3963 3973 3985 3989 3990 3991 3994 4000 4004 4007 4010
"<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
4012 4018 4028 4032 4036 4047 4054 4058 4062 4063 4072 4080 4089 4095 4103 4116 4117
"<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
4121 4123 4125 4129 4140 4142 4149 4157 4159 4161 4163 4167 4171 4173 4176 4179 4184
">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
4190 4204 4215 4217 4219 4222 4223 4239 4249 4259 4260 4271 4272 4277 4280 4289 4296
">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
4312 4315 4320 4322 4324 4326 4329 4330 4337 4339 4341 4345 4347 4351 4353 4362 4363
"<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" ">50K"
4368 4369 4370 4371 4374 4379 4382 4395 4402 4403 4420 4426 4428 4429 4430 4435 4446
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K"
4449 4451 4455 4461 4463 4464 4471 4477 4478 4479 4483 4484 4487 4490 4493 4500 4502
"<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
4503 4504 4507 4511 4513 4514 4515 4516 4520 4521 4529 4530 4535 4555 4556 4559 4567
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" "<=50K" ">50K" "<=50K"
4570 4573 4579 4584 4586 4587 4589 4597 4605 4608 4611 4615 4638 4650 4654 4656 4657
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
4664 4677 4682 4690 4693 4704 4705 4709 4719 4720 4729 4732 4736 4754 4757 4760 4761
"<=50K" "<=50K" ">50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" ">50K" ">50K" "<=50K"
4779 4781 4788 4793 4795 4806 4807 4808 4824 4832 4840 4841 4844 4846 4848 4856 4869
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
4873 4874 4877 4890 4892 4900 4921 4922 4924 4943 4945 4947 4950 4953 4954 4973 4974
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K"
4975 4977 4986 4995 5005 5011 5013 5016 5018 5026 5028 5032 5035 5036 5037 5042 5046
"<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
5049 5052 5055 5063 5072 5080 5083 5086 5095 5096 5103 5106 5118 5125
"<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
[ reached getOption("max.print") -- omitted 5033 entries ]
# Cross-tabulate the predicted labels (rows) against the actual labels
# (columns) of the second test set.
tab <- table(lm.pred, censusTest2[["X"]])
print(tab)
lm.pred <=50K >50K
<=50K 4166 966
>50K 365 536
# Accuracy = correctly classified cases (table diagonal) / all cases.
accuracy <- sum(diag(tab)) / sum(tab)
print(accuracy)
[1] 0.7793801
# ROC curve: true-positive rate plotted against false-positive rate.
roc <- performance(predict_log_roc, measure = "tpr", x.measure = "fpr")
plot(roc)
# Area under the ROC curve, held in a ROCR performance object.
auc <- performance(predict_log_roc, measure = "auc")
auc
A performance instance
'Area under the ROC curve'
# Replace the performance object with the numeric AUC stored as the first
# element of its y.values slot.
auc <- slot(auc, "y.values")[[1]]
print(auc)
[1] 0.7832074
# Prediction: building a decision tree model.
# Divide the dataset into training and test sets in a 70:30 ratio.
set.seed(123)
# caTools is installed and attached earlier in this analysis. Calling
# install.packages() on an attached package fails with "Updating loaded
# packages", so only install it when it is genuinely missing.
if (!requireNamespace("caTools", quietly = TRUE)) {
  install.packages("caTools")
}
library("caTools")
# sample.split() expects the vector of outcome labels, not the whole data
# frame. Given the data frame it works on its 15 columns and returns a
# length-15 mask, which subset() then recycles over the rows: a fixed
# repeating 10:5 pattern (about 67:33) rather than a random 70:30 split.
# NOTE: the outputs recorded below for the decision tree were produced with
# the old split; re-run to refresh them.
split_data <- sample.split(censusData$X, SplitRatio = 0.70)
censusTrain <- subset(censusData, split_data)
censusTest <- subset(censusData, !split_data)
nrow(censusTrain)
[1] 20107
# Number of rows held out in the test set.
nrow(censusTest)
[1] 10055
# Build a decision tree model where the dependent variable is "X"(Yearly Income) and the rest of the variables as independent variables
library(rpart)
library(rpart.plot)
package ‘rpart.plot’ was built under R version 3.6.3
# Classification tree (method = "class") predicting X from all other columns.
census_model <- rpart(X ~ ., data = censusTrain, method = "class")
library(rpart)
library(rpart.plot)
package ‘rpart.plot’ was built under R version 3.6.3
# Draw the fitted tree with the plot options type = 5, extra = 0, tweak = 1.5.
rpart.plot(census_model, type = 5, extra = 0, tweak = 1.5)
# Predicted class label for every row of the test set.
class_prediction <- predict(census_model, newdata = censusTest, type = "class")
class_prediction
2 4 5 8 11 17 19 20 23 26 32 34 35 38 41 47 49 50 53 56 62 64
>50K <=50K >50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K
65 68 71 77 79 80 83 86 92 94 95 98 101 107 109 110 113 116 122 124 125 128
<=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K >50K <=50K <=50K <=50K <=50K >50K <=50K <=50K >50K <=50K <=50K
131 137 139 140 143 146 152 154 155 158 161 167 169 170 173 176 182 184 185 188 191 197
<=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K >50K <=50K >50K >50K >50K <=50K <=50K
199 200 203 206 212 214 215 218 221 227 229 230 233 236 242 244 245 248 251 257 259 260
<=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K
263 266 272 274 275 278 281 287 289 290 293 296 302 304 305 308 311 317 319 320 323 326
<=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K
332 334 335 338 341 347 349 350 353 356 362 364 365 368 371 377 379 380 383 386 392 394
<=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K >50K <=50K <=50K <=50K <=50K <=50K <=50K
395 398 401 407 409 410 413 416 422 424 425 428 431 437 439 440 443 446 452 454 455 458
<=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K
461 467 469 470 473 476 482 484 485 488 491 497 499 500 503 506 512 514 515 518 521 527
<=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K
529 530 533 536 542 544 545 548 551 557 559 560 563 566 572 574 575 578 581 587 589 590
<=50K <=50K <=50K <=50K <=50K <=50K >50K >50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K
593 596 602 604 605 608 611 617 619 620 623 626 632 634 635 638 641 647 649 650 653 656
<=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K
662 664 665 668 671 677 679 680 683 686 692 694 695 698 701 707 709 710 713 716 722 724
<=50K <=50K <=50K >50K <=50K <=50K <=50K >50K >50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K
725 728 731 737 739 740 743 746 752 754 755 758 761 767 769 770 773 776 782 784 785 788
<=50K >50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K
791 797 799 800 803 806 812 814 815 818 821 827 829 830 833 836 842 844 845 848 851 857
<=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K
859 860 863 866 872 874 875 878 881 887 889 890 893 896 902 904 905 908 911 917 919 920
<=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K
923 926 932 934 935 938 941 947 949 950 953 956 962 964 965 968 971 977 979 980 983 986
<=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K >50K >50K <=50K <=50K <=50K <=50K <=50K
992 994 995 998 1001 1007 1009 1010 1013 1016 1022 1024 1025 1028 1031 1037 1039 1040 1043 1046 1052 1054
<=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K >50K >50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K
1055 1058 1061 1067 1069 1070 1073 1076 1082 1084 1085 1088 1091 1097 1099 1100 1103 1106 1112 1114 1115 1118
<=50K <=50K <=50K >50K <=50K <=50K >50K >50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K
1121 1127 1129 1130 1133 1136 1142 1144 1145 1148 1151 1157 1159 1160 1163 1166 1172 1174 1175 1178 1181 1187
<=50K <=50K >50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K >50K >50K <=50K >50K <=50K
1189 1190 1193 1196 1202 1204 1205 1208 1211 1217 1219 1220 1223 1226 1232 1234 1235 1238 1241 1247 1249 1250
>50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K
1253 1256 1262 1264 1265 1268 1271 1277 1279 1280 1283 1286 1292 1294 1295 1298 1301 1307 1309 1310 1313 1316
<=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K
1322 1324 1325 1328 1331 1337 1339 1340 1343 1346 1352 1354 1355 1358 1361 1367 1369 1370 1373 1376 1382 1384
<=50K <=50K <=50K <=50K <=50K <=50K >50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K
1385 1388 1391 1397 1399 1400 1403 1406 1412 1414 1415 1418 1421 1427 1429 1430 1433 1436 1442 1444 1445 1448
<=50K <=50K >50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K
1451 1457 1459 1460 1463 1466 1472 1474 1475 1478 1481 1487 1489 1490 1493 1496 1502 1504 1505 1508 1511 1517
<=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K >50K >50K <=50K <=50K
1519 1520 1523 1526 1532 1534 1535 1538 1541 1547 1549 1550 1553 1556 1562 1564 1565 1568 1571 1577 1579 1580
<=50K <=50K <=50K >50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K >50K <=50K <=50K <=50K <=50K >50K <=50K <=50K
1583 1586 1592 1594 1595 1598 1601 1607 1609 1610 1613 1616 1622 1624 1625 1628 1631 1637 1639 1640 1643 1646
<=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K >50K >50K <=50K <=50K <=50K >50K <=50K <=50K
1652 1654 1655 1658 1661 1667 1669 1670 1673 1676 1682 1684 1685 1688 1691 1697 1699 1700 1703 1706 1712 1714
<=50K <=50K >50K >50K <=50K >50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K
1715 1718 1721 1727 1729 1730 1733 1736 1742 1744 1745 1748 1751 1757 1759 1760 1763 1766 1772 1774 1775 1778
<=50K <=50K <=50K >50K <=50K >50K <=50K <=50K >50K <=50K >50K <=50K <=50K <=50K <=50K >50K >50K <=50K <=50K <=50K <=50K <=50K
1781 1787 1789 1790 1793 1796 1802 1804 1805 1808 1811 1817 1819 1820 1823 1826 1832 1834 1835 1838 1841 1847
>50K <=50K >50K <=50K <=50K >50K <=50K <=50K <=50K >50K <=50K >50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K
1849 1850 1853 1856 1862 1864 1865 1868 1871 1877 1879 1880 1883 1886 1892 1894 1895 1898 1901 1907 1909 1910
<=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K >50K <=50K >50K <=50K <=50K <=50K
1913 1916 1922 1924 1925 1928 1931 1937 1939 1940 1943 1946 1952 1954 1955 1958 1961 1967 1969 1970 1973 1976
<=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K
1982 1984 1985 1988 1991 1997 1999 2000 2003 2006 2012 2014 2015 2018 2021 2027 2029 2030 2033 2036 2042 2044
<=50K >50K >50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K
2045 2048 2051 2057 2059 2060 2063 2066 2072 2074 2075 2078 2081 2087 2089 2090 2093 2096 2102 2104 2105 2108
<=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K
2111 2117 2119 2120 2123 2126 2132 2134 2135 2138 2141 2147 2149 2150 2153 2156 2162 2164 2165 2168 2171 2177
<=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K >50K >50K <=50K <=50K >50K <=50K <=50K >50K <=50K <=50K <=50K >50K <=50K <=50K
2179 2180 2183 2186 2192 2194 2195 2198 2201 2207 2209 2210 2213 2216 2222 2224 2225 2228 2231 2237 2239 2240
<=50K <=50K <=50K <=50K <=50K >50K >50K <=50K <=50K <=50K <=50K >50K <=50K <=50K >50K <=50K <=50K <=50K <=50K >50K <=50K <=50K
2243 2246 2252 2254 2255 2258 2261 2267 2269 2270 2273 2276 2282 2284 2285 2288 2291 2297 2299 2300 2303 2306
<=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K
2312 2314 2315 2318 2321 2327 2329 2330 2333 2336 2342 2344 2345 2348 2351 2357 2359 2360 2363 2366 2372 2374
<=50K >50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K >50K <=50K <=50K <=50K
2375 2378 2381 2387 2389 2390 2393 2396 2402 2404 2405 2408 2411 2417 2419 2420 2423 2426 2432 2434 2435 2438
>50K >50K >50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K
2441 2447 2449 2450 2453 2456 2462 2464 2465 2468 2471 2477 2479 2480 2483 2486 2492 2494 2495 2498 2501 2507
<=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K >50K <=50K >50K <=50K <=50K <=50K >50K <=50K >50K <=50K <=50K <=50K <=50K <=50K
2509 2510 2513 2516 2522 2524 2525 2528 2531 2537 2539 2540 2543 2546 2552 2554 2555 2558 2561 2567 2569 2570
<=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K
2573 2576 2582 2584 2585 2588 2591 2597 2599 2600 2603 2606 2612 2614 2615 2618 2621 2627 2629 2630 2633 2636
<=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K
2642 2644 2645 2648 2651 2657 2659 2660 2663 2666 2672 2674 2675 2678 2681 2687 2689 2690 2693 2696 2702 2704
<=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K
2705 2708 2711 2717 2719 2720 2723 2726 2732 2734 2735 2738 2741 2747 2749 2750 2753 2756 2762 2764 2765 2768
<=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K
2771 2777 2779 2780 2783 2786 2792 2794 2795 2798 2801 2807 2809 2810 2813 2816 2822 2824 2825 2828 2831 2837
>50K >50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K <=50K
2839 2840 2843 2846 2852 2854 2855 2858 2861 2867 2869 2870 2873 2876 2882 2884 2885 2888 2891 2897 2899 2900
<=50K >50K >50K <=50K <=50K >50K >50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K >50K >50K <=50K
2903 2906 2912 2914 2915 2918 2921 2927 2929 2930 2933 2936 2942 2944 2945 2948 2951 2957 2959 2960 2963 2966
<=50K <=50K <=50K <=50K <=50K >50K <=50K >50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K <=50K >50K <=50K
2972 2974 2975 2978 2981 2987 2989 2990 2993 2996
<=50K <=50K <=50K <=50K >50K <=50K <=50K <=50K <=50K >50K
[ reached getOption("max.print") -- omitted 9055 entries ]
Levels: <=50K >50K
# Build a confusion matrix and calculate the accuracy.
# Layout (rows = predicted, columns = actual):
#   TP FP
#   FN TN
# TP and TN are correctly predicted; FP and FN are wrongly predicted.
tab <- table(class_prediction, censusTest[["X"]])
print(tab)
class_prediction <=50K >50K
<=50K 7185 1256
>50K 363 1251
# Accuracy = correctly classified cases (table diagonal) / all cases.
print(sum(diag(tab)) / sum(tab))
[1] 0.8389856
# Prediction: building a random forest model.
# Divide the dataset into training and test sets in an 80:20 ratio.
set.seed(123)
# caTools is installed and attached earlier in this analysis. Calling
# install.packages() on an attached package fails with "Updating loaded
# packages", so only install it when it is genuinely missing.
if (!requireNamespace("caTools", quietly = TRUE)) {
  install.packages("caTools")
}
library("caTools")
# Split on the outcome column X; TRUE marks training rows.
split_data <- sample.split(censusData$X, SplitRatio = 0.8)
censusTrain <- subset(censusData, split_data)
censusTest <- subset(censusData, !split_data)
nrow(censusTrain)
[1] 24129
# Number of rows held out in the test set.
nrow(censusTest)
[1] 6033
library(randomForest)
# Random forest of 300 trees predicting X from all other columns.
census_model <- randomForest(
  formula = X ~ .,
  data = censusTrain,
  ntree = 300
)
# Error rates stored in the fitted model, plotted against the number of trees.
plot(census_model)
# text() labels single-tree fits such as rpart objects; called on a
# randomForest object it fails with "'x' is a list, but does not have
# components 'x' and 'y'". Label the error curves with a legend instead.
# plot.randomForest() draws one curve per column of err.rate using matplot(),
# whose default colours and line types run 1, 2, 3, ...
legend(
  "topright",
  legend = colnames(census_model$err.rate),
  col = seq_len(ncol(census_model$err.rate)),
  lty = seq_len(ncol(census_model$err.rate)),
  cex = 0.8
)
# Predicted class for every test row, then cross-tabulated (rows = predicted,
# columns = actual).
cenus_prediction <- predict(census_model, newdata = censusTest, type = "class")
tab <- table(cenus_prediction, censusTest[["X"]])
print(tab)
cenus_prediction <=50K >50K
<=50K 4208 503
>50K 323 999
# Accuracy = correctly classified cases (table diagonal) / all cases.
print(sum(diag(tab)) / sum(tab))
[1] 0.8630864